Delivery Standardization

Exploratory Data Analysis | IS 6813

Author

Adam Bushman (u6049169)

Published

January 15, 2025

import polars as pl
import duckdb
import skimpy
import folium

from plotnine import (
    ggplot, aes, 
    geom_point, geom_boxplot, 
    scale_y_log10, scale_x_log10, 
    theme, coord_flip
)
# Setup DuckDB connection
con = duckdb.connect()

# Reference CSVs
cust_addr_zip_map = 'data/customer_address_and_zip_mapping.csv'
customer_profile = 'data/customer_profile.csv'
transaction_data = 'data/transactional_data.csv'
delivery_cost = 'data/delivery_cost_data.xlsx'
# For reading Excel documents: DuckDB's spatial extension provides st_read(), which can read .xlsx via GDAL
con.execute("""
INSTALL spatial;
LOAD spatial;
""");
# Load CSVs to persistent tables
con.execute(f"""
    CREATE TABLE cust_addr AS (
        SELECT * FROM read_csv_auto('{cust_addr_zip_map}')
    );

    CREATE TABLE cust_profile AS (
        SELECT * FROM read_csv_auto('{customer_profile}')
    );

    CREATE TABLE transactions AS (
        SELECT * FROM read_csv_auto('{transaction_data}')
    );

    CREATE TABLE delivery_cost AS (
        SELECT * FROM st_read('{delivery_cost}')
    );
""")
<duckdb.duckdb.DuckDBPyConnection at 0x7a8f5b5083f0>
con.sql("SHOW TABLES")
┌───────────────┐
│     name      │
│    varchar    │
├───────────────┤
│ cust_addr     │
│ cust_profile  │
│ delivery_cost │
│ transactions  │
└───────────────┘

customer_profile.csv

We can now use SQL to query the cust_profile table. Wherever we need programmatic work beyond SQL, we can pull the result into a polars dataframe:

cust_profile_df = con.sql("FROM cust_profile").pl()

Let’s first look at the columns:

con.sql("DESCRIBE cust_profile")
┌──────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│     column_name      │ column_type │  null   │   key   │ default │  extra  │
│       varchar        │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ CUSTOMER_NUMBER      │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ PRIMARY_GROUP_NUMBER │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ FREQUENT_ORDER_TYPE  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ FIRST_DELIVERY_DATE  │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ ON_BOARDING_DATE     │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ COLD_DRINK_CHANNEL   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ TRADE_CHANNEL        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ SUB_TRADE_CHANNEL    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ LOCAL_MARKET_PARTNER │ BOOLEAN     │ YES     │ NULL    │ NULL    │ NULL    │
│ CO2_CUSTOMER         │ BOOLEAN     │ YES     │ NULL    │ NULL    │ NULL    │
│ ZIP_CODE             │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
├──────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤
│ 11 rows                                                          6 columns │
└────────────────────────────────────────────────────────────────────────────┘

11 columns, and all the data types look reasonable. Let’s assess the sparsity of the data:

cust_profile_df.null_count()
shape: (1, 11)
column                 null_count (u32)
CUSTOMER_NUMBER        0
PRIMARY_GROUP_NUMBER   18196
FREQUENT_ORDER_TYPE    0
FIRST_DELIVERY_DATE    0
ON_BOARDING_DATE       0
COLD_DRINK_CHANNEL     0
TRADE_CHANNEL          0
SUB_TRADE_CHANNEL      0
LOCAL_MARKET_PARTNER   0
CO2_CUSTOMER           0
ZIP_CODE               0
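Since skimpy is already imported, a one-line skim gives a fuller automated profile. A minimal sketch, assuming this skimpy version wants a pandas frame (skip the conversion if yours accepts polars directly):

from skimpy import skim

# Automated column-by-column profile: dtypes, missingness, distributions
skim(cust_profile_df.to_pandas())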

Only PRIMARY_GROUP_NUMBER has nulls, which fits: solo customers don’t belong to a group. Otherwise we have mostly categorical features in here. Let’s look at some of their distributions, such as…

How many groups/individual customers are there? (remember, many individual customers could roll up to a group)

con.sql("""
    SELECT
    COUNT(DISTINCT COALESCE(PRIMARY_GROUP_NUMBER, CUSTOMER_NUMBER)) AS TOT_CUST
    FROM cust_profile
""")
┌──────────┐
│ TOT_CUST │
│  int64   │
├──────────┤
│    19216 │
└──────────┘

There are a few dozen groups with many customers; most customers, however, are solo.

con.sql("""
    SELECT
    COALESCE(PRIMARY_GROUP_NUMBER, CUSTOMER_NUMBER) AS CUST_ID
    ,COUNT(*)
    FROM cust_profile
    GROUP BY ALL
    ORDER BY COUNT(*) DESC
""")
┌───────────┬──────────────┐
│  CUST_ID  │ count_star() │
│   int64   │    int64     │
├───────────┼──────────────┤
│       437 │          670 │
│      2487 │          616 │
│       340 │          558 │
│      1194 │          365 │
│      2183 │          307 │
│      1431 │          237 │
│       404 │          219 │
│       104 │          177 │
│      1594 │          159 │
│       384 │          158 │
│        ·  │            · │
│        ·  │            · │
│        ·  │            · │
│ 600569534 │            1 │
│ 501625173 │            1 │
│ 500636951 │            1 │
│ 500944185 │            1 │
│ 501213180 │            1 │
│ 501681374 │            1 │
│ 600083643 │            1 │
│ 600569505 │            1 │
│ 600069200 │            1 │
│ 501487334 │            1 │
├───────────┴──────────────┤
│ ? rows         2 columns │
└──────────────────────────┘
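To pin down “a few dozen”, a quick count of groups that actually contain more than one customer (a sketch, not in the original run):

con.sql("""
    SELECT COUNT(*) AS MULTI_CUSTOMER_GROUPS
    FROM (
        SELECT PRIMARY_GROUP_NUMBER
        FROM cust_profile
        WHERE PRIMARY_GROUP_NUMBER IS NOT NULL
        GROUP BY PRIMARY_GROUP_NUMBER
        HAVING COUNT(*) > 1
    ) AS grps
""")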

Which order types do customers most frequently use?

con.sql("""
    SELECT FREQUENT_ORDER_TYPE, COUNT(*) AS VOL
    FROM cust_profile
    GROUP BY FREQUENT_ORDER_TYPE
    ORDER BY COUNT(*) DESC
""")
┌─────────────────────┬───────┐
│ FREQUENT_ORDER_TYPE │  VOL  │
│       varchar       │ int64 │
├─────────────────────┼───────┤
│ SALES REP           │ 20017 │
│ OTHER               │  5378 │
│ MYCOKE360           │  2386 │
│ CALL CENTER         │  1375 │
│ MYCOKE LEGACY       │   981 │
│ EDI                 │   341 │
└─────────────────────┴───────┘
And how do customers break down by cold drink channel?

con.sql("""
    SELECT COLD_DRINK_CHANNEL, COUNT(*) AS VOL
    FROM cust_profile
    GROUP BY COLD_DRINK_CHANNEL
    ORDER BY COUNT(*) DESC
""")
┌────────────────────┬───────┐
│ COLD_DRINK_CHANNEL │  VOL  │
│      varchar       │ int64 │
├────────────────────┼───────┤
│ DINING             │ 15518 │
│ GOODS              │  5829 │
│ EVENT              │  3079 │
│ PUBLIC SECTOR      │  1740 │
│ BULK TRADE         │  1323 │
│ ACCOMMODATION      │  1241 │
│ WORKPLACE          │  1201 │
│ WELLNESS           │   490 │
│ CONVENTIONAL       │    57 │
└────────────────────┴───────┘

We see mostly local market partners.

con.sql("""
    SELECT LOCAL_MARKET_PARTNER, COUNT(*) AS VOL
    FROM cust_profile
    GROUP BY LOCAL_MARKET_PARTNER
    ORDER BY COUNT(*) DESC
""")
┌──────────────────────┬───────┐
│ LOCAL_MARKET_PARTNER │  VOL  │
│       boolean        │ int64 │
├──────────────────────┼───────┤
│ true                 │ 27355 │
│ false                │  3123 │
└──────────────────────┴───────┘

However, if we intersect by group, there may be more to the story:

con.sql("""
    SELECT
    LOCAL_MARKET_PARTNER,
    SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 1 ELSE 0 END) AS CUST,
    SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 0 ELSE 1 END) AS GROUP
    FROM cust_profile
    GROUP BY LOCAL_MARKET_PARTNER
""")
┌──────────────────────┬────────┬────────┐
│ LOCAL_MARKET_PARTNER │  CUST  │ GROUP  │
│       boolean        │ int128 │ int128 │
├──────────────────────┼────────┼────────┤
│ true                 │  17597 │   9758 │
│ false                │    599 │   2524 │
└──────────────────────┴────────┴────────┘

Solo customers are far more likely to be local market partners (which makes sense).

How does buying CO2 break down, again splitting solo customers from group members?

con.sql("""
    SELECT CO2_CUSTOMER,
    SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 1 ELSE 0 END) AS CUST,
    SUM(CASE WHEN PRIMARY_GROUP_NUMBER IS NULL THEN 0 ELSE 1 END) AS GROUP
    FROM cust_profile
    GROUP BY CO2_CUSTOMER
""")
┌──────────────┬────────┬────────┐
│ CO2_CUSTOMER │  CUST  │ GROUP  │
│   boolean    │ int128 │ int128 │
├──────────────┼────────┼────────┤
│ true         │   9456 │   2526 │
│ false        │   8740 │   9756 │
└──────────────┴────────┴────────┘

It’s about 50-50, but we see that “franchises” are far less likely to source their CO2 from Swire. So how valuable is that business?

customer_address_and_zip_mapping.csv

Let’s create our polars data frame:

cust_addr_df = con.sql("FROM cust_addr").pl()

And now look at the columns:

con.sql("DESCRIBE cust_addr")
┌──────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name  │ column_type │  null   │   key   │ default │  extra  │
│   varchar    │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ zip          │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ full address │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└──────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

We need to parse those full addresses. We can do that by splitting the string, grabbing the list elements of interest, and casting them appropriately.

con.execute("""
    CREATE TABLE cust_addr_detail AS (
        SELECT
        zip
        ,LIST_ELEMENT(STRING_SPLIT("full address", ','), 2) AS city
        ,LIST_ELEMENT(STRING_SPLIT("full address", ','), 3) AS state
        ,LIST_ELEMENT(STRING_SPLIT("full address", ','), 4) AS state_abbr
        ,LIST_ELEMENT(STRING_SPLIT("full address", ','), 5) AS county
        ,CAST(LIST_ELEMENT(STRING_SPLIT("full address", ','), 7) AS DOUBLE) AS lat
        ,CAST(LIST_ELEMENT(STRING_SPLIT("full address", ','), 8) AS DOUBLE) AS lon
        FROM cust_addr
    )
""")
<duckdb.duckdb.DuckDBPyConnection at 0x7a8f5b5083f0>
con.sql("DESCRIBE cust_addr_detail")
┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ zip         │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ city        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ state       │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ state_abbr  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ county      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ lat         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ lon         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘
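One optional sanity check before we use these (not in the original run): confirm the parsed coordinates fall in plausible ranges and that nothing failed to cast.

con.sql("""
    SELECT
    MIN(lat) AS min_lat
    ,MAX(lat) AS max_lat
    ,MIN(lon) AS min_lon
    ,MAX(lon) AS max_lon
    ,COUNT(*) FILTER (WHERE lat IS NULL OR lon IS NULL) AS missing_coords
    FROM cust_addr_detail
""")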

Looks great! Let’s join and see where most customers are by state.

con.sql("""
    SELECT 
    cad.state
    ,COUNT(*)
    FROM cust_profile cp
    INNER JOIN cust_addr_detail cad ON cad.zip = cp.ZIP_CODE
    GROUP BY cad.state
""")
┌───────────────┬──────────────┐
│     state     │ count_star() │
│    varchar    │    int64     │
├───────────────┼──────────────┤
│ Louisiana     │          390 │
│ Maryland      │         4901 │
│ Kansas        │         7163 │
│ Kentucky      │         6991 │
│ Massachusetts │        11033 │
└───────────────┴──────────────┘

It appears that most customers are found in Massachusetts. Let’s see if we can’t render a map:

cust_addr = con.sql("SELECT * FROM cust_addr_detail").pl()
swire_map = folium.Map(
    location = [
        cust_addr['lat'].mean(),
        cust_addr['lon'].mean()
    ],
    zoom_start = 4.5, 
    control_scale = True
)
for row in cust_addr.iter_rows():
    folium.Marker(
        location = [row[5], row[6]], 
        icon = folium.Icon(color = "red")
    ).add_to(swire_map)
swire_map
(Interactive folium map of customer locations; renders only in a trusted notebook.)

There is somewhat more concentration around city centers, but not nearly as much as I would expect. Seems curious.
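With this many overlapping markers, density is hard to judge from the plain map. A sketch using folium’s MarkerCluster plugin, which collapses nearby markers into counts (same cust_addr frame as above):

from folium.plugins import MarkerCluster

cluster_map = folium.Map(
    location=[cust_addr["lat"].mean(), cust_addr["lon"].mean()],
    zoom_start=4,
    control_scale=True,
)
# Nearby markers collapse into a single count bubble at low zoom
cluster = MarkerCluster().add_to(cluster_map)
for row in cust_addr.iter_rows(named=True):
    folium.Marker(location=[row["lat"], row["lon"]]).add_to(cluster)
cluster_map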

transactional_data.csv

Let’s create our polars data frame:

transac_df = con.sql("FROM transactions").pl()

And now look at the columns:

con.sql("DESCRIBE transactions")
┌───────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│    column_name    │ column_type │  null   │   key   │ default │  extra  │
│      varchar      │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ TRANSACTION_DATE  │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ WEEK              │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ YEAR              │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ CUSTOMER_NUMBER   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ ORDER_TYPE        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ ORDERED_CASES     │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ LOADED_CASES      │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ DELIVERED_CASES   │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ ORDERED_GALLONS   │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ LOADED_GALLONS    │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ DELIVERED_GALLONS │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
├───────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤
│ 11 rows                                                       6 columns │
└─────────────────────────────────────────────────────────────────────────┘

Let’s look at the summary statistics here:

transac_df.describe()
shape: (9, 12)

statistic    TRANSACTION_DATE             WEEK       YEAR         CUSTOMER_NUMBER  ORDER_TYPE   ORDERED_CASES  LOADED_CASES  DELIVERED_CASES  ORDERED_GALLONS  LOADED_GALLONS  DELIVERED_GALLONS
count        1045540                      1.04554e6  1.04554e6    1.04554e6        1045540      1.04554e6      1.04554e6     1.04554e6        1.04554e6        1.04554e6       1.04554e6
null_count   0                            0.0        0.0          0.0              0            0.0            0.0           0.0              0.0              0.0             0.0
mean         2023-12-27 11:40:07.138000   26.228599  2023.496534  5.4664e8         null         26.851646      25.922583     25.133342        9.873689         9.604892        9.208331
std          null                         14.516752  0.499988     4.9427e7         null         126.762052     122.787259    121.515626       26.469447        25.649953       25.175433
min          2023-01-01                   1.0        2023.0       5.00245678e8     CALL CENTER  0.0            0.0           -3132.0          0.0              0.0             -1792.5
25%          2023-06-30                   14.0       2023.0       5.0109192e8      null         0.0            0.0           0.0              0.0              0.0             0.0
50%          2023-12-27                   26.0       2023.0       5.01548213e8     null         7.0            7.0           6.0              0.0              0.0             0.0
75%          2024-06-25                   38.0       2024.0       6.00080939e8     null         18.5           18.0          17.3332          12.5             12.5            12.5
max          2024-12-31                   52.0       2024.0       6.00975408e8     null         8479.888       8171.564      8069.483         2562.5           2562.5          2292.5

No missing data, which is very good, though note the negative minimums for delivered cases and gallons (we’ll return to those at the end). Most customers order nothing, or very little, on any given transaction date. Let’s evaluate on an annual basis.

con.sql("""
    SELECT
    YEAR
    ,CUSTOMER_NUMBER
    ,SUM(ORDERED_CASES) + SUM(ORDERED_GALLONS) AS ORDERED_QTY
    FROM transactions
    GROUP BY
    YEAR, CUSTOMER_NUMBER
""").pl().describe()
shape: (9, 4)
┌────────────┬─────────────┬─────────────────┬─────────────┐
│ statistic  │ YEAR        │ CUSTOMER_NUMBER │ ORDERED_QTY │
│ str        │ f64         │ f64             │ f64         │
├────────────┼─────────────┼─────────────────┼─────────────┤
│ count      │ 55653.0     │ 55653.0         │ 55653.0     │
│ null_count │ 0.0         │ 0.0             │ 0.0         │
│ mean       │ 2023.531885 │ 5.4083e8        │ 689.950353  │
│ std        │ 0.498987    │ 4.8565e7        │ 5111.046976 │
│ min        │ 2023.0      │ 5.00245678e8    │ 0.0         │
│ 25%        │ 2023.0      │ 5.01127715e8    │ 67.0        │
│ 50%        │ 2024.0      │ 5.01542171e8    │ 170.0       │
│ 75%        │ 2024.0      │ 6.00077803e8    │ 457.5       │
│ max        │ 2024.0      │ 6.00975408e8    │ 459175.7404 │
└────────────┴─────────────┴─────────────────┴─────────────┘

We get our first glimpse at why there’s the annual 400 gallon threshold: the median customer-year totals just 170 ordered cases plus gallons, and over 2/3 don’t reach 400. Woof.
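We can pin that share down exactly; a quick sketch (mixing cases and gallons the same way ORDERED_QTY does above):

con.sql("""
    WITH annual AS (
        SELECT YEAR, CUSTOMER_NUMBER,
               SUM(ORDERED_CASES) + SUM(ORDERED_GALLONS) AS ORDERED_QTY
        FROM transactions
        GROUP BY YEAR, CUSTOMER_NUMBER
    )
    -- Share of customer-years under the 400 threshold
    SELECT AVG(CASE WHEN ORDERED_QTY < 400 THEN 1.0 ELSE 0.0 END) AS SHARE_BELOW_400
    FROM annual
""")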

annual_data = con.sql("""
    SELECT
    t.CUSTOMER_NUMBER
    ,cp.COLD_DRINK_CHANNEL
    ,t.YEAR
    ,date_diff('week', ON_BOARDING_DATE, FIRST_DELIVERY_DATE) AS WK_RAMP_UP
    ,date_diff('year', FIRST_DELIVERY_DATE, current_date) AS YR_TENURE
    ,SUM(t.ORDERED_CASES) + SUM(t.ORDERED_GALLONS) AS TOTAL_ORDERED
    FROM transactions t
    INNER JOIN cust_profile cp ON cp.CUSTOMER_NUMBER = t.CUSTOMER_NUMBER
    GROUP BY ALL
""").pl()
annual_data.shape
(55653, 6)
(
    ggplot(annual_data)
    + geom_point(
        aes("WK_RAMP_UP", "TOTAL_ORDERED", color="COLD_DRINK_CHANNEL"),
        alpha = 0.75
    )
    + scale_y_log10()
    + scale_x_log10()
)
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log10
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: invalid value encountered in log10
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/plotnine/layer.py:364: PlotnineWarning: geom_point : Removed 1 rows containing missing values.
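The divide-by-zero warnings come from zeros in WK_RAMP_UP and TOTAL_ORDERED hitting the log scales (and the invalid-value warning from any negative ramp-ups). A minimal pre-filter, assuming we’re fine dropping those rows:

plot_df = annual_data.filter(
    # log10 is undefined at zero and below, so keep strictly positive values
    (pl.col("WK_RAMP_UP") > 0) & (pl.col("TOTAL_ORDERED") > 0)
)

Substituting plot_df for annual_data in these scatter plots draws the same picture without the warnings.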

(
    ggplot(annual_data)
    + geom_point(
        aes("YR_TENURE", "TOTAL_ORDERED", color="COLD_DRINK_CHANNEL"), 
        alpha = 0.75
    )
    + scale_y_log10()
    + scale_x_log10()
)
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log10

delivery_cost_data.xlsx

Let’s create our polars data frame:

deliv_df = con.sql("FROM delivery_cost").pl()

And now look at the columns:

con.sql("DESCRIBE delivery_cost")
┌──────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│     column_name      │ column_type │  null   │   key   │ default │  extra  │
│       varchar        │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ Cold Drink Channel   │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Vol Range            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Applicable To        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Median Delivery Cost │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ Cost Type            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└──────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘
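Before joining, it’s worth a quick peek at “Vol Range” (not in the original run), since the parsing below assumes it mixes bounded ranges written ‘low - high’ with open-ended ones written ‘low+’:

con.sql('SELECT DISTINCT "Vol Range" FROM delivery_cost ORDER BY 1')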

Let’s intersect this cost data with each customer and their annual orders.

cost_data = con.sql("""
    WITH 
    -- Split "Vol Range" into raw bound strings
    delivery AS (
        SELECT *
        ,LIST_ELEMENT(STRING_SPLIT(d."Vol Range", ' - '), 1) AS MIN_COST
        ,LIST_ELEMENT(STRING_SPLIT(d."Vol Range", ' - '), 2) AS MAX_COST
        FROM delivery_cost d
    ),
    -- Cast the bounds to integers (despite the _COST names, these are volume bounds);
    -- open-ended 'low+' ranges get INT32 max as their upper bound
    delivery_cleaned AS (
        SELECT *
        ,CAST((CASE 
            WHEN MIN_COST LIKE '%+' THEN trim(MIN_COST, '+') 
            ELSE MIN_COST END
            ) AS INT) AS MIN_COST_VOL
        ,COALESCE(CAST(MAX_COST AS INT), 2147483647) AS MAX_COST_VOL
        FROM delivery
    ), 
    -- Aggregate to one row per customer-year: order count plus annual cases and gallons
    joined AS (
        SELECT 
        t.YEAR
        ,t.CUSTOMER_NUMBER
        ,c.COLD_DRINK_CHANNEL
        ,c.FREQUENT_ORDER_TYPE
        ,SUM(CASE WHEN ORDERED_CASES > 0 OR ORDERED_GALLONS > 0 THEN 1 ELSE 0 END) AS ORDER_CNT
        ,SUM(ORDERED_CASES) AS ANNUAL_CASES
        ,SUM(ORDERED_GALLONS) AS ANNUAL_GALLONS
        FROM transactions t
        INNER JOIN cust_profile c ON c.CUSTOMER_NUMBER = t.CUSTOMER_NUMBER
        GROUP BY ALL
    ),
    -- Attach the median delivery cost (treated as per-unit): d1 matches gallons to
    -- 'Fountain' brackets, d2 matches cases to 'Bottles and Cans' brackets
    formatted AS (
        SELECT 
        j.*
        ,d2."Median Delivery Cost" AS COST_CASES
        ,d1."Median Delivery Cost" AS COST_GALLONS
        ,(d1."Median Delivery Cost" * j.ANNUAL_GALLONS) + (d2."Median Delivery Cost" * j.ANNUAL_CASES) AS DELIVERY_COST
        ,((d1."Median Delivery Cost" * j.ANNUAL_GALLONS) + (d2."Median Delivery Cost" * j.ANNUAL_CASES)) / ORDER_CNT AS DELIVERY_COST_PER_ORDER
        FROM joined j
        LEFT JOIN delivery_cleaned d1 ON d1."Cold Drink Channel" = j.COLD_DRINK_CHANNEL
            AND d1."Applicable To" = 'Fountain'
            AND j.ANNUAL_GALLONS BETWEEN d1.MIN_COST_VOL AND d1.MAX_COST_VOL
        LEFT JOIN delivery_cleaned d2 ON d2."Cold Drink Channel" = j.COLD_DRINK_CHANNEL
            AND d2."Applicable To" = 'Bottles and Cans'
            AND j.ANNUAL_CASES BETWEEN d2.MIN_COST_VOL AND d2.MAX_COST_VOL
    )

    FROM formatted
""").pl()
(
    ggplot(cost_data)
    + geom_boxplot(
        aes(x="COLD_DRINK_CHANNEL", y="DELIVERY_COST")
    )
    + scale_y_log10()
    + coord_flip()
)
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log10
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/plotnine/layer.py:284: PlotnineWarning: stat_boxplot : Removed 441 rows containing non-finite values.

(
    ggplot(cost_data)
    + geom_boxplot(
        aes(x="FREQUENT_ORDER_TYPE", y="DELIVERY_COST")
    )
    + scale_y_log10()
    + coord_flip()
)
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log10
/home/adam-bushman/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/plotnine/layer.py:284: PlotnineWarning: stat_boxplot : Removed 441 rows containing non-finite values.
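Both boxplots drop 441 rows as non-finite: customer-years where no cost bracket matched (null DELIVERY_COST from the left joins) or where the cost is zero and log10 maps it to -inf. A quick tally (a sketch):

cost_data.select(
    # Nulls mean neither join found a matching volume bracket
    pl.col("DELIVERY_COST").is_null().sum().alias("null_cost"),
    # Zeros become -inf on the log scale
    (pl.col("DELIVERY_COST") == 0).sum().alias("zero_cost"),
)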

Finally, recall the negative delivered quantities in the transaction summary. How common are they?

con.sql("""
SELECT 
SUM(CASE WHEN DELIVERED_GALLONS < 0 OR DELIVERED_CASES < 0 THEN 1 ELSE 0 END)
,SUM(CASE WHEN DELIVERED_GALLONS < 0 OR DELIVERED_CASES < 0 THEN 1 ELSE 0 END) / COUNT(*)
,COUNT(*)
FROM transactions t
""")
┌──────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────┐
│ sum(CASE  WHEN (((DELIVERED_GALLONS < 0) OR (DELIVERED_CASES < 0))) THEN (1) ELSE 0 END) │ (sum(CASE  WHEN (((DELIVERED_GALLONS < 0) OR (DELIVERED_CASES < 0))) THEN (1) ELSE 0 END) / count_star()) │ count_star() │
│                                          int128                                          │                                                  double                                                   │    int64     │
├──────────────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────────┤
│                                                                                     4221 │                                                                                      0.004037148267880712 │      1045540 │
└──────────────────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┘
And how many distinct customers/groups do those negative deliveries touch?

con.sql("""
SELECT 
COUNT(DISTINCT COALESCE(cp.PRIMARY_GROUP_NUMBER, t.CUSTOMER_NUMBER))
FROM transactions t
INNER JOIN cust_profile cp ON cp.CUSTOMER_NUMBER = t.CUSTOMER_NUMBER
WHERE DELIVERED_GALLONS < 0 OR DELIVERED_CASES < 0
""")
┌──────────────────────────────────────────────────────────────────────┐
│ count(DISTINCT COALESCE(cp.PRIMARY_GROUP_NUMBER, t.CUSTOMER_NUMBER)) │
│                                int64                                 │
├──────────────────────────────────────────────────────────────────────┤
│                                                                 1969 │
└──────────────────────────────────────────────────────────────────────┘

So while only about 0.4% of transactions show a negative delivery, they touch nearly 2,000 distinct customers/groups, roughly a tenth of the customer base. Worth understanding before modeling.